Final assignment presentation

Alexander Kraess, Andrew Wells, Julian Kath & Lorenzo Gini

11/18/2022

Research question

  • Between 2016 and 2022 there were major shifts in the majorities in the US Congress:

    • in 2016, both chambers were held by Republicans

    • in 2018, the Democrats gained a majority in the House of Representatives

    • in 2020, the Democrats gained control of both the House and the Senate

  • One might expect that the post-2016 and post-2020 Congresses vary in their policies. However, it is also interesting to focus on the period between 2018 and 2020, when the two chambers had different majorities and needed to cooperate.

  • We will focus on the question of whether and how the different majorities had an impact on the policies that were passed by Congress.

Scraping and cleaning the data

We scraped our data from: https://data.gov/developers/apis/index.html

# Bill-level data: one table with bill number, subjects, summary, policy
# area, cosponsor party shares, dates and session.
df <- read_csv(
  file = "https://raw.githubusercontent.com/juka19/tad_assignment3/main/data/data_11_28.csv"
)
## New names:
## Rows: 920 Columns: 13
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (3): subjects, summary, policy_area dbl (8): ...1, ...2, ...3, Unnamed: 0, bill
## number, cosponsor_D_perc, cospo... date (2): latest_action, date
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...1`
## * `...1` -> `...2`
## * `...2` -> `...3`
# Cosponsor table: bill number plus cosponsor name and party.
df_cosp <- read_csv(
  file = "https://raw.githubusercontent.com/juka19/tad_assignment3/main/data/cosponsors_sponsors.csv"
)
## New names:
## Rows: 21448 Columns: 4
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (2): cosponsor_name, cosponsor_party dbl (2): ...1, number
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...1`
# Bill metadata (title, type, origin chamber, congress, ...) read from the
# local project folder. file.path() assembles the path with a portable
# separator, so the script does not depend on Windows-style backslashes.
df_bills <- read_csv(file.path("data", "old_data", "all_bills.csv"))
## New names:
## Rows: 986 Columns: 13
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (6): latestAction, originChamber, originChamberCode, title, type, url dbl (5):
## ...1, index, Unnamed: 0, congress, number dttm (1): updateDateIncludingText
## date (1): updateDate
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...1`
# Preview the first five rows of the bill data.
df %>% head(n = 5)
## # A tibble: 5 x 13
##    ...1  ...2  ...3 `Unnamed: 0` `bill number` subjects      summary policy_area
##   <dbl> <dbl> <dbl>        <dbl>         <dbl> <chr>         <chr>   <chr>      
## 1     1     1     0            0          4996 "{'legislati~ "Bankr~ Finance an~
## 2     2     2     1            1          8906 "{'legislati~ "Lifes~ Health     
## 3     3     3     2            2          8810 "{'legislati~ "Natio~ Emergency ~
## 4     4     4     3            3          8611 "{'legislati~ "Desig~ Government~
## 5     5     5     4            4          8354 "{'legislati~ "Servi~ Civil Righ~
## # ... with 5 more variables: latest_action <date>, cosponsor_D_perc <dbl>,
## #   cosponsor_R_perc <dbl>, date <date>, session <dbl>

Creating party variable

# Label each bill by the party make-up of its cosponsors:
#   share of Democratic cosponsors above 0.66  -> "Democrat"
#   share of Republican cosponsors above 0.66  -> "Republican"
#   neither share above 0.66                   -> "Both"
# A missing share propagates to a missing (NA) label.
df$party <- with(
  df,
  ifelse(
    cosponsor_D_perc > 0.66,
    "Democrat",
    ifelse(cosponsor_R_perc > 0.66, "Republican", "Both")
  )
)

Summary statistics

Density of cosponsors

# Histogram (scaled to density) with an overlaid kernel density estimate of
# the Democratic cosponsor share per bill.
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(df, aes(x = cosponsor_D_perc)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black",
    fill = "white"
  ) +
  geom_density(alpha = .1, fill = "blue") +
  labs(title = "Density of bill cosponsor party",
       x = "Cosponsor party composition", y = "Density",
       caption = "Numbers represent proportion of cosponsors from Democratic party, 
       so 0.0 represents bills that were fully Republican and 1.0 represents 
       bills that were fully Democrat.") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Policy fields

# Read the saved plot specification from JSON and hand it to plotly as a
# widget. file.path() keeps the path portable across operating systems
# (the original used Windows-only backslashes).
fromJSON(file = file.path("slides", "plots", "polar_line_plot")) %>%
  plotly::as_widget()
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...

Creating a corpus

# The bill summary is exposed under the column name `text` before the
# corpus is built from the data frame; all other columns travel along.
df_corp <- rename(df, text = summary)
corp <- corpus(df_corp)
## Warning: NA is replaced by empty string

Creating a dfm from the corpus

# Document-feature matrix of the bill summaries:
#   1. tokenise, dropping punctuation, numbers and symbols
#   2. remove English stopwords
#   3. lemmatise via the lexicon::hash_lemmas lookup table, then stem
#   4. remove frequent legislative boilerplate terms (listed in the form
#      they have after the lemmatising/stemming steps)
# `pattern` is spelled out in full; the original `patter =` only worked
# through partial argument matching.
dfmat <- corp %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_remove(pattern = stopwords("en")) %>%
  tokens_replace(
    pattern = lexicon::hash_lemmas$token,
    replacement = lexicon::hash_lemmas$lemma
  ) %>%
  tokens_wordstem() %>%
  tokens_remove(c(
    "sec", "bill", "act", "section", "funds", "shall", "must", "use",
    "author", "fund", "provid", "program", "requir", "divis", "titl",
    "appropri", "specifi"
  )) %>%
  dfm()

Wordclouds

Most common words in all congresses

# Per-session document-feature matrix for the comparison word cloud.
# The original passed `remove`, `remove_numbers`, `remove_punct` and
# `groups` straight to dfm() on a corpus; those arguments are deprecated in
# quanteda 3 and no longer accepted in quanteda 4. The same steps are spelled
# out explicitly here: tokenise -> drop stopwords -> dfm -> group by session.
dfmatCon <- corp %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE) %>%
  tokens_remove(pattern = stopwords("english")) %>%
  dfm() %>%
  dfm_group(groups = corp$session) %>%
  dfm_remove(c("sec", "bill", "act", "section", "funds", "shall", "must", "used")) %>%
  dfm_trim(min_termfreq = 3)

# One cloud segment per session group, coloured in group order.
textplot_wordcloud(dfmatCon, comparison = TRUE, max_words = 300,
                   color = c("blue", "red"))

#Wordcloud congress 115

Comparing the 115th and 116th congress

# Session 115: subset of the main dfm, plus a separate corpus built from
# the session's bills for the party comparison cloud.
dfmat_115 <- dfm_subset(dfmat, session == 115)
corp_115 <- df %>%
  filter(session == 115) %>%
  rename(text = summary) %>%
  corpus()

# Party-grouped dfm. The deprecated dfm(corpus, remove =, groups =, ...)
# shortcut is replaced by the explicit tokens -> dfm -> dfm_group pipeline,
# which performs the same steps.
modelpart15 <- corp_115 %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE) %>%
  tokens_remove(pattern = stopwords("english")) %>%
  dfm() %>%
  dfm_group(groups = corp_115$party) %>%
  dfm_remove(c("sec", "bill", "act", "section", "funds", "shall", "must", "used")) %>%
  dfm_trim(min_termfreq = 3)

mp15 <- textplot_wordcloud(modelpart15, comparison = TRUE, max_words = 300,
                   color = c("green","blue", "red"))

# Session 116: corpus of the session's bills for the party comparison cloud.
corp_116 <- df %>%
  filter(session == 116) %>%
  rename(text = summary) %>%
  corpus()

# Party-grouped dfm. The deprecated dfm(corpus, remove =, groups =, ...)
# shortcut is replaced by the explicit tokens -> dfm -> dfm_group pipeline,
# which performs the same steps.
modelpart16 <- corp_116 %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE) %>%
  tokens_remove(pattern = stopwords("english")) %>%
  dfm() %>%
  dfm_group(groups = corp_116$party) %>%
  dfm_remove(c("sec", "bill", "act", "section", "funds", "shall", "must", "used")) %>%
  dfm_trim(min_termfreq = 3)

mp16 <- textplot_wordcloud(modelpart16, comparison = TRUE, max_words = 300,
                   color = c("green","blue", "red"))

Dimensionality plotting

# Corpus built directly from the summary text (no document variables).
corp2 <- corpus(df$summary)
## Warning: NA is replaced by empty string
# Bag-of-words matrix: punctuation and English stopwords removed, features
# occurring fewer than 5 times dropped.
# `pattern` is spelled out in full; the original `patter =` only worked
# through partial argument matching.
dfmat2 <- corp2 %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = stopwords("en")) %>%
  dfm() %>%
  dfm_trim(min_termfreq = 5)

# Project the dense document-feature matrix with umap(); the first two
# columns of the result are stored as plotting coordinates on df.
embeddings <- umap(as.matrix(dfmat2)) 

df$x <- embeddings[,1]
df$y <- embeddings[,2]

# Fill colour for each party label.
colordict <- c(Democrat = "blue", Republican = "red", Both = "yellow")

# Scatter plot of the projected bills, filled by party, then converted to
# an interactive plotly object and displayed.
p <- ggplot(data = df, mapping = aes(x = x, y = y, fill = party)) +
  geom_point(shape = 21, size = 0.5, color = "grey") +
  scale_fill_manual(values = colordict) +
  theme_bw()

p <- ggplotly(p)
p
# Keep only bills whose cosponsor share is exactly 1.0 for one party;
# every other bill gets NA in party_full and is dropped.
df1 <- df %>%
  mutate(party_full = ifelse(cosponsor_D_perc == 1.0, "Dem",
                             ifelse(cosponsor_R_perc == 1.0, "Rep", NA))) %>%
  drop_na(party_full)

corp3 <- corpus(df1$summary)

# Same preprocessing as for the full data set.
# `pattern` is spelled out in full; the original `patter =` only worked
# through partial argument matching.
dfmat3 <- corp3 %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = stopwords("en")) %>%
  dfm() %>%
  dfm_trim(min_termfreq = 5)

# Projection of the single-party bills; first two columns become the
# plotting coordinates on df1.
embeddings2 <- umap(as.matrix(dfmat3)) 

df1$x <- embeddings2[,1]
df1$y <- embeddings2[,2]

# Fill colours for the two party labels shown in this plot.
colordict2 <- c(Democrat = "blue", Republican = "red")

# Scatter plot of the projected single-party bills, filled by party, then
# converted to an interactive plotly object and displayed.
j <- ggplot(data = df1, mapping = aes(x = x, y = y, fill = party)) +
  geom_point(shape = 21, size = 0.5, color = "grey") +
  scale_fill_manual(values = colordict2) +
  theme_bw()

j <- ggplotly(j)
j

Sentiment analysis

# Bill data with additional sentiment score columns (file name suggests
# VADER scores; the `compound` column is used below).
summary_sentiment <- read_csv(
  file = "https://raw.githubusercontent.com/juka19/tad_assignment3/main/data/data_w_vader.csv"
)
## New names:
## Rows: 920 Columns: 18
## -- Column specification
## -------------------------------------------------------- Delimiter: "," chr
## (3): subjects, summary, policy_area dbl (13): ...1, Unnamed: 0, ...3, ...4,
## Unnamed: 0.1, bill number, cosponso... date (2): latest_action, date
## i Use `spec()` to retrieve the full column specification for this data. i
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## * `` -> `...1`
## * `...1` -> `...3`
## * `...2` -> `...4`
# Party label per bill, using the same 0.66 cosponsor-share rule as for the
# main data set.
summary_sentiment$party <- with(
  summary_sentiment,
  ifelse(
    cosponsor_D_perc > 0.66,
    "Democrat",
    ifelse(cosponsor_R_perc > 0.66, "Republican", "Both")
  )
)

# Mean compound score per party and date, spread to one column per party.
# Bills labelled "Both" and bills without a party label (column "NA") are
# excluded from the comparison. any_of() drops these columns when present
# and does not fail when one of them is absent from the data.
wide_sentiment <- summary_sentiment %>%
  group_by(party, date) %>%
  summarise(score = mean(compound)) %>%
  pivot_wider(names_from = party, values_from = score) %>%
  select(-any_of(c("Both", "NA")))
## `summarise()` has grouped output by 'party'. You can override using the
## `.groups` argument.
# Calendar of every day from 2017-01-01 to 2022-12-31.
days <- data.frame(
  date = seq(from = as.Date("2017-01-01"), to = as.Date("2022-12-31"), by = 1)
)

# Attach the per-party scores to the calendar (days without a score stay
# NA), then return to long format with one row per date and party.
daily_sentiment <- left_join(days, wide_sentiment) %>%
  pivot_longer(cols = -date, names_to = "party", values_to = "score")
## Joining, by = "date"
# Daily mean sentiment per party with a loess trend line.
# Colours are mapped by party name rather than by position, so the
# assignment does not depend on the ordering of the party values and matches
# the Democrat = blue / Republican = red convention used in the other plots.
# The point layer inherits y = score from the plot-level aesthetics.
p4 <- ggplot(daily_sentiment, aes(x = date, y = score, colour = party)) +
  geom_point(size = 1) +
  theme_minimal() +
  geom_smooth(method = "loess", se = FALSE) +
  scale_color_manual(values = c("Democrat" = "blue", "Republican" = "red"))
ggplotly(p4)
## `geom_smooth()` using formula 'y ~ x'

Network graph of House members in the 116th Congress

# Cosponsor rows enriched with bill data and origin chamber, restricted to
# session 116 bills originating in the House. Names are normalised into a
# single lower-case token: "<word char>. " sequences (e.g. middle initials)
# are stripped and spaces become underscores.
df_labels <- df_cosp %>%
  inner_join(df, by = c("number" = "bill number")) %>%
  inner_join(select(df_bills, number, originChamber), by = "number") %>%
  filter(session == 116 & originChamber == "House") %>%
  mutate(
    cosponsor_name = str_replace_all(
      tolower(str_replace_all(cosponsor_name, "\\w\\.\\s", "")),
      " ",
      "_"
    )
  )

# One row per bill with all of its cosponsor tokens in a single string.
# Assignments use `<-` at the start of the pipeline instead of a trailing
# `->`, so the target name is visible up front.
df_grouped <- df_labels %>%
  group_by(number) %>%
  summarize(cosps = paste(cosponsor_name, collapse = " "))

# Feature co-occurrence matrix of cosponsor tokens across bills, turned into
# a weighted undirected graph and converted to visNetwork node/edge tables.
# TRUE is written out because `T` is an ordinary variable that can be
# reassigned.
visn <- corpus(df_grouped$cosps) %>%
  tokens() %>%
  fcm() %>%
  as.matrix() %>%
  graph_from_adjacency_matrix(mode = "undirected", weighted = TRUE) %>%
  toVisNetworkData()

# Copy the edge weight into the `value` column of the edge table.
visn$edges$value <- visn$edges$weight

# Keep only edges with a weight above 25.
edges <- data.frame(visn$edges) %>%
  filter(weight > 25)

# Node table: attach each cosponsor's party, keep one row per label, colour
# by party ("R" dark red, anything else dark blue), and drop nodes that do
# not appear in any remaining edge. Labels are turned back into readable
# title-case names last, after the matching against the edge table.
# TRUE is written out because `T` is an ordinary variable that can be
# reassigned.
nodes <- data.frame(visn$nodes) %>%
  left_join(
    df_labels %>% select(cosponsor_name, cosponsor_party) %>% distinct(),
    by = c('label' = 'cosponsor_name')
  ) %>%
  filter(!duplicated(label)) %>%
  rename('group' = 'cosponsor_party') %>%
  mutate(color.background = if_else(group == "R",  "darkred", "darkblue"),
         shadow = TRUE,
         font.color = 'black',
         borderWidth = 2) %>%
  filter(label %in% edges$from | label %in% edges$to) %>%
  mutate(label = str_replace_all(label, '_',  ' ') %>% tools::toTitleCase(),
         title = label)

# Interactive network: grey node borders, orange highlight for selected
# nodes and edges, light grey edges, and a capped physics velocity.
network <- visNetwork(nodes, edges) %>%
  visNodes(
    color = list(
      border = "darkgrey",
      borderWidth = 2,
      highlight = list(border = "darkgrey", background = "orange")
    )
  ) %>%
  visEdges(color = list(color = "#d3d3d3", highlight = "orange")) %>%
  visPhysics(maxVelocity = 3)

network

Topic modeling

# Precomputed UMAP projection with topic clusters, read from the local
# project folder. file.path() keeps the path portable (the original used
# Windows-only backslashes).
# NOTE(review): this data frame shares its name with the umap() function
# called earlier in the script; function calls still resolve, but a distinct
# name would be clearer.
umap <- read.csv(file.path("data", "umap_projection_data.csv"))

# Scatter plot coloured by Democratic cosponsor share (diverging scale
# centred on 0.5); the `text` aesthetic carries the hover details.
# Assigned with a leading `<-` instead of a trailing `->`.
p <- umap %>% 
  ggplot(aes(x, y, color = cosponsor_D_perc,
             text = paste('Title:' , title, 
                          '<br>Policy Area: ', policyarea, 
                          '<br>Session: ', session, 
                          '<br>Topic: ', cluster))) +  
  geom_point() + 
  scale_color_gradient2(midpoint = .5, name = "Democrat share") + 
  theme_minimal() + 
  labs(x = '', y = '', title = "UMAP Projection of US Congress bill summaries 115-117") +
  theme(panel.background = element_rect(fill='#fbfbfb', colour='#fbfbfb'))

ggp <- ggplotly(p)

ggp

Conclusion

  • In general the legislation appears quite similar, with exceptions limited to specific policy areas and topics
  • Democratic and Republican cooperation across the different congress sessions is highly dependent on the policy area and topic of the bill
  • There are some key actors that enable working across the aisle and are known for bipartisan collaboration